\[ \begin{aligned} Var(\alpha X+(1-\alpha)Y)&=\alpha^2 Var(X)+(1-\alpha)^2 Var(Y)+2Cov(\alpha X,(1-\alpha)Y)\\ &=\alpha^2 \sigma_X^2+(1-\alpha)^2\sigma_Y^2+2\alpha(1-\alpha)\sigma_{XY} \\ &=\alpha^2(\sigma_X^2+\sigma_Y^2-2\sigma_{XY})-\alpha(2\sigma_Y^2-2\sigma_{XY})+\sigma_Y^2 \end{aligned} \] This is a quadratic function of \(\alpha\) whose leading coefficient \(\sigma_X^2+\sigma_Y^2-2\sigma_{XY}=Var(X-Y)\ge 0\), so it attains its minimum at \(\alpha=\frac{-b}{2a}=\frac{\sigma_Y^2-\sigma_{XY}}{\sigma_X^2+\sigma_Y^2-2\sigma_{XY}}\)
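Equivalently, differentiating with respect to \(\alpha\) and setting the derivative to zero gives the same minimizer: \[ \frac{d}{d\alpha}Var(\alpha X+(1-\alpha)Y)=2\alpha(\sigma_X^2+\sigma_Y^2-2\sigma_{XY})-(2\sigma_Y^2-2\sigma_{XY})=0 \quad\Longrightarrow\quad \alpha=\frac{\sigma_Y^2-\sigma_{XY}}{\sigma_X^2+\sigma_Y^2-2\sigma_{XY}} \]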
Each bootstrap observation is drawn uniformly at random from the \(n\) original observations, so the probability that the first bootstrap observation is the \(j\)-th observation from the original sample is \(1/n\), and the probability that it is not is \[ P=1-\frac{1}{n}=\frac{n-1}{n} \]
Because bootstrap sampling is done with replacement, the same argument applies to the second bootstrap observation: \[ P=\frac{n-1}{n} \]
Since we have calculated that the probability that the \(i\)-th bootstrap observation (for any \(i\) from 1 to \(n\)) is not the \(j\)-th observation from the original sample is \(P=\frac{n-1}{n}\), and the \(n\) draws are independent, we can multiply: \[ P=\frac{n-1}{n}\cdot\frac{n-1}{n}\cdots\frac{n-1}{n}=\left(\frac{n-1}{n}\right)^n \]
\[ p=1-\left(\frac{5-1}{5}\right)^{5}\approx 0.67 \]
\[ p=1-\left(\frac{100-1}{100}\right)^{100}\approx 0.63 \]
\[ p=1-\left(\frac{10000-1}{10000}\right)^{10000}\approx 0.63 \]
# Probability that the j-th observation appears in a bootstrap sample,
# as a function of the sample size n
x = 1:100000
y = rep(0, 100000)
for (i in 1:100000) {
  y[i] = 1 - ((i - 1) / i)^i
}
plot(x, y)
As n becomes larger, the probability quickly converges.
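The limiting value follows from the standard limit \[ \lim_{n\to\infty}\left(\frac{n-1}{n}\right)^{n}=\lim_{n\to\infty}\left(1-\frac{1}{n}\right)^{n}=e^{-1}, \qquad\text{so}\qquad p\to 1-e^{-1}\approx 0.632. \]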
# Draw 10000 bootstrap samples of the indices 1:100 and record whether
# the 4th observation appears in each sample
store = rep(NA, 10000)
for (i in 1:10000) {
  store[i] = sum(sample(1:100, replace = TRUE) == 4) > 0
}
mean(store)
## [1] 0.6269
About 63% of the bootstrap datasets contain the 4th observation, which is close to the theoretical probability obtained in (g).
First, we shuffle the whole dataset and split it into 10 folds. Each time, we use 9 folds as the training set and the remaining fold as the validation set. We repeat this process 10 times, once for each fold, and average the 10 resulting error estimates.
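As a rough illustration (not part of the original answer), a minimal sketch of 10-fold cross-validation for a regression fit; the data frame `da`, the response `y`, and the predictor `x` here are hypothetical placeholders:
set.seed(1)
k = 10
n = nrow(da)                               # `da` is a placeholder data frame
folds = sample(rep(1:k, length.out = n))   # randomly assign each observation to a fold
cv.errors = rep(0, k)
for (f in 1:k) {
  fit = lm(y ~ x, data = da[folds != f, ])            # train on the other 9 folds
  pred = predict(fit, da[folds == f, ])               # predict the held-out fold
  cv.errors[f] = mean((da$y[folds == f] - pred)^2)    # MSE on the validation fold
}
mean(cv.errors)                            # 10-fold CV estimate of the test MSE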
Relative to the validation set approach: the advantage is less bias, since each training set contains most of the observations; the disadvantage is a higher computational cost.
Relative to LOOCV: the advantages are a computational saving and a lower-variance, more stable estimate; the disadvantage is more bias.
We can use the bootstrap to estimate the standard deviation of the prediction: repeatedly draw bootstrap samples from the data, refit the model on each sample, form the prediction for the particular value of the predictor, and take the standard deviation of these bootstrap predictions.
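For example, a minimal sketch with the boot package, assuming a data frame `da` with response `y`, a single predictor `x`, and a fixed query point `x0` (all hypothetical placeholders):
library(boot)
pred.fn = function(data, index) {
  # Refit the model on the bootstrap sample and predict at the fixed point x0
  fit = lm(y ~ x, data = data[index, ])
  predict(fit, newdata = data.frame(x = x0))
}
boot(da, pred.fn, R = 1000)   # the reported std. error estimates the SD of the prediction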
library(ISLR)
glm.fit=glm(default~income+balance,data=Default,family=binomial)
set.seed(1)
# Split the observations into a training half and a validation half
train = sample(dim(Default)[1], round(dim(Default)[1] / 2), replace = FALSE)
glm.fit1 = glm(default ~ balance + income, data = Default, subset = train, family = binomial)
# Posterior probabilities of default for the validation observations
prob1 = predict(glm.fit1, Default[-train, ], type = 'response')
class1 = rep('No', length(prob1))
class1[prob1 > 0.5] = 'Yes'
# Fraction of validation observations classified correctly
mean(class1 == Default$default[-train])
## [1] 0.9746
The validation set error rate is 1 - 0.9746 ≈ 0.025, i.e. about 2.5%.
result = rep(0, 3)
for (i in 2:4) {
  # Repeat the validation set approach with three different random splits
  set.seed(i)
  train = sample(dim(Default)[1], round(dim(Default)[1] / 2), replace = FALSE)
  glm.fit1 = glm(default ~ balance + income, data = Default, subset = train, family = binomial)
  prob1 = predict(glm.fit1, Default[-train, ], type = 'response')
  class1 = rep('No', length(prob1))
  class1[prob1 > 0.5] = 'Yes'
  result[i - 1] = mean(class1 == Default$default[-train])
}
result
All three prediction accuracies are close to 97%, so the validation set error rate varies only slightly with the particular split.
set.seed(1)
train=sample(dim(Default)[1],round(dim(Default)[1]/2),replace=FALSE)
glm.fit1=glm(default~balance+income+student,data=Default,subset=train,family=binomial)
prob1=predict(glm.fit1,Default[-train,],type='response')
class1=rep('No',length(prob1))
class1[prob1>0.5]='Yes'
mean(class1==Default$default[-train])
## [1] 0.974
Adding a dummy variable for student does not lead to a reduction in the test error.
glm.fit=glm(default~income+balance,data=Default,family=binomial)
summary(glm.fit)
##
## Call:
## glm(formula = default ~ income + balance, family = binomial,
## data = Default)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -2.4725 -0.1444 -0.0574 -0.0211 3.7245
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -1.154e+01 4.348e-01 -26.545 < 2e-16 ***
## income 2.081e-05 4.985e-06 4.174 2.99e-05 ***
## balance 5.647e-03 2.274e-04 24.836 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 2920.6 on 9999 degrees of freedom
## Residual deviance: 1579.0 on 9997 degrees of freedom
## AIC: 1585
##
## Number of Fisher Scoring iterations: 8
Standard errors for income and balance are 4.985e-06 and 2.274e-04.
boot.fn<-function(data,index){
return(coef(glm(default~income+balance,data=data,subset=index,family=binomial)))
}
library(boot)
set.seed(1)
boot(Default,boot.fn,100)
##
## ORDINARY NONPARAMETRIC BOOTSTRAP
##
##
## Call:
## boot(data = Default, statistic = boot.fn, R = 100)
##
##
## Bootstrap Statistics :
## original bias std. error
## t1* -1.154047e+01 8.556378e-03 4.122015e-01
## t2* 2.080898e-05 -3.993598e-07 4.186088e-06
## t3* 5.647103e-03 -4.116657e-06 2.226242e-04
The bootstrap standard errors for income and balance are 4.186e-06 and 2.226e-04.
It can be seen that the bootstrap method provides results very similar to those from the standard formula.
# Logistic regression of Direction on Lag1 and Lag2 using the full Weekly data
glm.fit1 = glm(Direction ~ Lag1 + Lag2, data = Weekly, family = binomial)
# The same model fitted without the first observation
glm.fit2 = glm(Direction ~ Lag1 + Lag2, data = Weekly[-1, ], family = binomial)
# Predict the first observation using the model that excludes it
prob = predict(glm.fit2, Weekly[1, ], type = 'response')
pred = ifelse(prob > 0.5, 'Up', 'Down')
pred == Weekly[1, ]$Direction
## 1
## FALSE
No; the prediction does not match the true direction of the first observation, so it is misclassified.
n = dim(Weekly)[1]
result = rep(0, n)
for (i in 1:n) {
  # Fit the model on all observations except the i-th, then predict the i-th
  glm.fit2 = glm(Direction ~ Lag1 + Lag2, data = Weekly[-i, ], family = binomial)
  prob = predict(glm.fit2, Weekly[i, ], type = 'response')
  pred = ifelse(prob > 0.5, 'Up', 'Down')
  result[i] = as.numeric(pred != Weekly[i, ]$Direction)   # 1 if misclassified
}
mean(result)
## [1] 0.4444444
The LOOCV estimate of the test error rate is about 0.44.
set.seed(1)
y=rnorm(100)
x=rnorm(100)
y=x-2*x^2+rnorm(100)
n is 100 and p is 1.
\[ y=x-2x^2+\epsilon \] This is a polynomial (quadratic) model in \(x\), where \(\epsilon\sim N(0,1)\).
plot(x,y)
The scatterplot shows a clear quadratic relationship between y and x (a downward-opening parabola).
set.seed(1)
error1 = rep(0, 4)   # LOOCV sum of squared errors for polynomial degrees 1 to 4
da = data.frame(x, y)
n = dim(da)[1]
for (i in 1:4) {
  result = 0
  for (j in 1:n) {
    # Fit a degree-i polynomial leaving out the j-th observation
    lm.fit2 = lm(y ~ poly(x, i), data = da[-j, ])
    pred = predict(lm.fit2, da[j, ])
    result = result + (pred - da$y[j])^2
  }
  error1[i] = result
}
error1
## [1] 589.0979 108.6596 110.2585 111.4772
set.seed(2)
error2 = rep(0, 4)
da = data.frame(x, y)
n = dim(da)[1]
for (i in 1:4) {
  result = 0
  for (j in 1:n) {
    lm.fit2 = lm(y ~ poly(x, i), data = da[-j, ])
    pred = predict(lm.fit2, da[j, ])
    result = result + (pred - da$y[j])^2
  }
  error2[i] = result
}
error2
## [1] 589.0979 108.6596 110.2585 111.4772
The results are identical to those in (c): LOOCV involves no random splitting of the data, so the random seed has no influence on the result.
The quadratic model (degree 2) has the smallest LOOCV error. This is what we expected, because the data were simulated from a quadratic relationship between y and x.
# After the loops above, i equals 4, so this fits the degree-4 polynomial on the full data
lm.fit2=lm(y~poly(x,i),data=da)
summary(lm.fit2)
##
## Call:
## lm(formula = y ~ poly(x, i), data = da)
##
## Residuals:
## Min 1Q Median 3Q Max
## -2.8914 -0.5244 0.0749 0.5932 2.7796
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -1.8277 0.1041 -17.549 <2e-16 ***
## poly(x, i)1 2.3164 1.0415 2.224 0.0285 *
## poly(x, i)2 -21.0586 1.0415 -20.220 <2e-16 ***
## poly(x, i)3 -0.3048 1.0415 -0.293 0.7704
## poly(x, i)4 -0.4926 1.0415 -0.473 0.6373
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 1.041 on 95 degrees of freedom
## Multiple R-squared: 0.8134, Adjusted R-squared: 0.8055
## F-statistic: 103.5 on 4 and 95 DF, p-value: < 2.2e-16
Yes, the results agree with the cross-validation conclusion: the linear and quadratic terms are statistically significant, while the cubic and quartic terms are not.
library(MASS)
mean(Boston$medv)
## [1] 22.53281
sd(Boston$medv)/sqrt(dim(Boston)[1])
## [1] 0.4088611
The standard error measures how far the sample mean is likely to be from the true population mean.
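In formula form, with \(s\) the sample standard deviation of medv and \(n=506\) observations: \[ \widehat{SE}(\hat{\mu})=\frac{s}{\sqrt{n}}=\frac{sd(\text{medv})}{\sqrt{506}}\approx 0.409 \]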
mu<-function(data,index){
return(mean(data$medv[index]))
}
boot(Boston,mu,100)
##
## ORDINARY NONPARAMETRIC BOOTSTRAP
##
##
## Call:
## boot(data = Boston, statistic = mu, R = 100)
##
##
## Bootstrap Statistics :
## original bias std. error
## t1* 22.53281 -0.05943281 0.3972639
The bootstrap estimate of the standard error (0.397) is very close to the formula-based estimate (0.409).
t.test(Boston$medv)
##
## One Sample t-test
##
## data: Boston$medv
## t = 55.111, df = 505, p-value < 2.2e-16
## alternative hypothesis: true mean is not equal to 0
## 95 percent confidence interval:
## 21.72953 23.33608
## sample estimates:
## mean of x
## 22.53281
# Approximate 95% confidence interval using the bootstrap standard error from above
c(mean(Boston$medv)-2*0.3973,mean(Boston$medv)+2*0.3973)
## [1] 21.73821 23.32741
The two confidence intervals are very close.
median(Boston$medv)
## [1] 21.2
mu<-function(data,index){
return(median(data$medv[index]))
}
boot(Boston,mu,100)
##
## ORDINARY NONPARAMETRIC BOOTSTRAP
##
##
## Call:
## boot(data = Boston, statistic = mu, R = 100)
##
##
## Bootstrap Statistics :
## original bias std. error
## t1* 21.2 0.0195 0.3888532
quantile(Boston$medv,0.1)
## 10%
## 12.75
mu<-function(data,index){
return(quantile(data$medv[index],0.1))
}
boot(Boston,mu,100)
##
## ORDINARY NONPARAMETRIC BOOTSTRAP
##
##
## Call:
## boot(data = Boston, statistic = mu, R = 100)
##
##
## Bootstrap Statistics :
## original bias std. error
## t1* 12.75 0.038 0.4519945